{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A mini-`einsum` using loopy\n",
    "\n",
    "In this problem, we will design a function that carries out an `einsum`-type operation using `loopy`. It should be usable as shown in the tests towards the end of the worksheet. Also try to perform a simple parallelization so that your code will run on a GPU."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import numpy.linalg as la\n",
    "\n",
    "import pyopencl as cl\n",
    "import pyopencl.array\n",
    "import pyopencl.clmath\n",
    "import pyopencl.clrandom\n",
    "\n",
    "import loopy as lp\n",
    "\n",
    "from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some hints:\n",
    "\n",
    "* `loopy.Reduction(\"sum\", (\"i\", \"j\", \"k\"), expr)` expresses a sum.\n",
    "* Build the loop domain `{[i,j]: 0<=i<Ni and 0<=j<Nj}` as a string and pass it to loopy.\n",
    "* To build strings, use\n",
    "    * `str.join()`: `\",\".join(names)` and \n",
    "    * `str.format`: `\"Hi {name}\".format(name=\"Andreas\")`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "def loopy_einsum(queue, spec, *args):\n",
    "    arg_spec, out_spec = spec.split(\"->\")\n",
    "    arg_specs = arg_spec.split(\",\")\n",
    "    # ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#clear\n",
    "def loopy_einsum(queue, spec, *args):\n",
    "    arg_spec, out_spec = spec.split(\"->\")\n",
    "    arg_specs = arg_spec.split(\",\")\n",
    "\n",
    "    out_indices = set(out_spec)\n",
    "\n",
    "    all_indices = set(\n",
    "        idx\n",
    "        for argsp in arg_specs\n",
    "        for idx in argsp) | out_indices\n",
    "\n",
    "    sum_indices = all_indices - out_indices\n",
    "\n",
    "    from pymbolic import var\n",
    "    lhs = var(\"out\")[tuple(var(i) for i in out_spec)]\n",
    "\n",
    "    rhs = 1\n",
    "    for i, argsp in enumerate(arg_specs):\n",
    "        rhs = rhs * var(\"arg%d\" % i)[tuple(var(i) for i in argsp)]\n",
    "\n",
    "    if sum_indices:\n",
    "        rhs = lp.Reduction(\"sum\", tuple(var(idx) for idx in sum_indices), rhs)\n",
    "\n",
    "    constraints = \" and \".join(\n",
    "        \"0 <= %s < N%s\" % (idx, idx)\n",
    "        for idx in all_indices\n",
    "        )\n",
    "\n",
    "    domain = \"{[%s]: %s}\" % (\",\".join(all_indices), constraints)\n",
    "    knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)])\n",
    "\n",
    "    knl = lp.set_options(knl, write_cl=True)\n",
    "\n",
    "    kwargs = {}\n",
    "    for i, arg in enumerate(args):\n",
    "        kwargs[\"arg%d\" % i] = arg\n",
    "\n",
    "    evt, (out,) = knl(queue, **kwargs)\n",
    "    return out"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let us test our implementation, using a simple matrix-matrix multiplication:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Choose platform:\n",
      "[0] <pyopencl.Platform 'Portable Computing Language' at 0x7f2748f856e8>\n",
      "[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x21df538>\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Choice [0]: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Set the environment variable PYOPENCL_CTX='0' to avoid being asked again.\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Ni, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nj, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nk, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ arg0, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ arg1, __global \u001b[36mdouble\u001b[39;49;00m *__restrict__ out)\n",
      "{\n",
      "  \u001b[36mdouble\u001b[39;49;00m acc_k;\n",
      "\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= -\u001b[34m1\u001b[39;49;00m + Nj; ++j)\n",
      "    \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + Ni >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + Nk >= \u001b[34m0\u001b[39;49;00m)\n",
      "      \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= -\u001b[34m1\u001b[39;49;00m + Ni; ++i)\n",
      "      {\n",
      "        acc_k = \u001b[34m0.0\u001b[39;49;00m;\n",
      "        \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k = \u001b[34m0\u001b[39;49;00m; k <= -\u001b[34m1\u001b[39;49;00m + Nk; ++k)\n",
      "          acc_k = acc_k + arg0[Nk * i + k] * arg1[Nj * k + j];\n",
      "        out[Nj * i + j] = acc_k;\n",
      "      }\n",
      "}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-3-4330b2eeb4dd>:31: DeprecationWarning: ExpressionInstruction is deprecated. Use Assignment instead\n",
      "  knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.3624569501569665e-12\n"
     ]
    }
   ],
   "source": [
    "cl_context = cl.create_some_context(interactive=True)\n",
    "queue = cl.CommandQueue(cl_context)\n",
    "\n",
    "a = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)\n",
    "b = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)\n",
    "\n",
    "ab = loopy_einsum(queue, \"ik,kj->ij\", a, b)\n",
    "\n",
    "diff =  a.get().dot(b.get()) - ab.get()\n",
    "\n",
    "print(la.norm(diff, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
